In [2]:

    
%matplotlib inline



In [3]:

    
import requests
from bs4 import BeautifulSoup
import pprint

def get_content(url):
    """ Grab html content from url """
    response = requests.get(url)
    html = response.content
    return html

def find_speakers(html):
    """ Find the speakers and talk info in the html """
    soup = BeautifulSoup(html)
    speakers = soup.body.article.ul.findAll('li')
    return speakers

AJ Vicens

Mother Jones

Featured at these sessions and panels:

How I learned to take command of the command line: A journalist's guide to getting started



In [4]:

    
def parse_speakers(speakers):
    """ Parse speakers that were found in HTML """
    
    data = {}
    for row in speakers:
        # Make sure we don't have any data left from previous loop
        speaker = org = talks = None

        # Grab speaker and org
        speaker = row.find("h3").string
        if speaker:
            org = row.find('p').string

        # Grab talk titles and talk links
        talks = {}
        talk_data = row.findAll('a', href=True)
        for talk in talk_data:
            link = 'http://ire.org' + talk['href']
            title = talk.string
            talks[title] = link

        # Add everything to a dictionary
        if speaker and talks:
            data[speaker] = {
                    'org' : org,
                    'talks' : talks, 
                }
    return data



In [47]:

    
# Sample output of 5 records
#import random

# select random keys from the dictionary
#random_keys = random.sample(data.keys(), 3)

# loop over keys and select
#for key in random_keys:
#   print('\n', key)
#   pprint.pprint(data[key])



In [48]:

    
# Process borrowed from here: https://github.com/scrapinghub/pycon-speakers/blob/432499e350098c69d4b3e0f641c960d927ec596d/pycon_speakers/pipelines.py

import sexmachine.detector as gender

def get_gender(name):
    firstname = name.split()[0]
    d = gender.Detector()
    name_gender = d.get_gender(firstname)
    return name_gender

def count_genders(names):
    gender_count = {}
    names = data.keys()
    for name in names:
        name_gender = get_gender(name)
        try:
            gender_count[name_gender] += 1
        except KeyError:
            gender_count[name_gender] = 1

    print gender_count
    return gender_count



In [49]:

    
urls = (
    (2015, 'http://ire.org/conferences/nicar2015/speakers/'), 
    (2014, 'http://ire.org/conferences/nicar-2014/speakers/'),
    (2013, 'http://ire.org/conferences/nicar-2013/speakers/'),
    )



In [50]:

    
counts = {}
for url in urls:
    html = get_content(url[1])
    speakers = find_speakers(html)
    data = parse_speakers(speakers)
    count = count_genders(data)
    
    counts[url[0]] = count
    
pprint.pprint(counts)









    



{u'mostly_male': 12, u'male': 143, u'andy': 24, u'female': 69, u'mostly_female': 4}
{u'mostly_male': 13, u'male': 137, u'andy': 14, u'mostly_female': 3, u'female': 61}
{u'mostly_male': 4, u'male': 98, u'andy': 10, u'mostly_female': 2, u'female': 35}
{2013: {u'andy': 10,
        u'female': 35,
        u'male': 98,
        u'mostly_female': 2,
        u'mostly_male': 4},
 2014: {u'andy': 14,
        u'female': 61,
        u'male': 137,
        u'mostly_female': 3,
        u'mostly_male': 13},
 2015: {u'andy': 24,
        u'female': 69,
        u'male': 143,
        u'mostly_female': 4,
        u'mostly_male': 12}}



In [5]:

    
out = {2013: {u'andy': 10,
        u'female': 35,
        u'male': 98,
        u'mostly_female': 2,
        u'mostly_male': 4},
 2014: {u'andy': 14,
        u'female': 61,
        u'male': 137,
        u'mostly_female': 3,
        u'mostly_male': 13},
 2015: {u'andy': 24,
        u'female': 69,
        u'male': 143,
        u'mostly_female': 4,
        u'mostly_male': 12}}



In [6]:

    
timedata = []
for k,v in out.iteritems():
    male = v['mostly_male'] + v['male']
    female = v['mostly_female'] + v['female']
    total = male + female
    
    timedata.append((k, female*1.0/total, male*1.0/total))
print timedata









    



[(2013, 0.26618705035971224, 0.7338129496402878), (2014, 0.29906542056074764, 0.7009345794392523), (2015, 0.3201754385964912, 0.6798245614035088)]



In [7]:

    
import pandas as pd

# Turn the years into the index. 
# There is probably a better way to do this. 
years = [i[0] for i in timedata]
values = [i[1:3] for i in timedata]
df = pd.DataFrame(values, index=years)
print df









    



             0         1
2013  0.266187  0.733813
2014  0.299065  0.700935
2015  0.320175  0.679825



In [8]:

    
import matplotlib.pyplot as plt

plt.figure()
df.plot()
plt.legend(loc='best')
plt.show()









    





<matplotlib.figure.Figure at 0x106b24250>

http://glowingpython.blogspot.com/2011/07/polynomial-curve-fitting.html

https://www.google.com/search?q=Polynomial+extrapolation+python&oq=Polynomial+extrapolation+python&aqs=chrome..69i57.1074j0j1&sourceid=chrome&es_sm=91&ie=UTF-8#q=Polynomial+extrapolation+python+pandas

http://en.wikipedia.org/wiki/Extrapolation

http://en.wikipedia.org/wiki/Exponential_smoothing